rm(list=ls())
library(survival)
library(survminer)
library(preprocessCore)
library(plyr)
library(ggpubr)

# Project selection and run configuration ----

# Keep only the GDC project ids that contain "TCGA". getGDCprojects() is
# called through the public `::` operator rather than `:::`, which is meant
# for non-exported internals.
projects <- TCGAbiolinks::getGDCprojects()$project_id
projects <- grep("TCGA", projects, value = TRUE)

# Directory holding the per-project "<project>Data.csv",
# "<project>Metadata.csv" and "<project>Clinical.csv" files. Keep the trailing
# slash: file names are appended to this string further down.
path <- "Z:/Bioinformatics/ExternalDatabases/TCGAbiolinksAnalysis/UnnormalizedData/"

# Accumulator for the stacked per-sample expression table.
stor <- c()

# Gene symbol(s) to extract from every project's expression file.
Gene <- c("LAIR1")

# Load expression for the gene(s) in `Gene` from every project ----
#
# Result: `stor`, a data frame with one row per sample column of the
# expression file, one column per matched gene, plus `proj` (the project id).

# NOTE(review): the original script derived the "Primary solid Tumor" barcodes
# from the metadata file but never applied them, so every sample column was
# kept. That behaviour is preserved by default; set this flag to TRUE to
# restrict the table to primary tumour samples. This assumes the expression
# column names are the metadata `cases` barcodes with "-" replaced by "." --
# TODO confirm against the CSV files before enabling.
filterPrimaryTumor <- FALSE

# Tolerate `path` with or without a trailing slash.
dataDir <- sub("/+$", "", path)

# Collect one data frame per project and bind once at the end instead of
# growing `stor` with rbind() on every iteration. The list is deliberately
# unnamed: rbind() would otherwise prefix the row names with the list names.
storList <- vector("list", length(projects))

for (k in seq_along(projects)) {
  project <- projects[k]
  dat <- read.csv(file.path(dataDir, paste0(project, "Data.csv")))

  # Column X holds the row identifiers; keep the part before the first "|".
  genes <- sub("\\|.*$", "", as.character(dat$X))
  geneRows <- which(genes %in% Gene)
  if (length(geneRows) == 0) {
    # A frame without gene columns cannot be row-bound with the others.
    warning("No rows matching Gene found for ", project, "; project skipped",
            call. = FALSE)
    next
  }

  tumorDat <- dat[geneRows, , drop = FALSE]
  tumorDat$X <- NULL

  if (filterPrimaryTumor) {
    metadata <- read.csv(file.path(dataDir, paste0(project, "Metadata.csv")))
    tumor <- metadata$cases[
      which(metadata$tissue.definition == "Primary solid Tumor")
    ]
    tumor <- gsub("-", ".", tumor, fixed = TRUE)
    keep <- colnames(tumorDat) %in% tumor
    if (!any(keep)) {
      warning("No primary tumor sample columns matched for ", project,
              "; project skipped", call. = FALSE)
      next
    }
    tumorDat <- tumorDat[, keep, drop = FALSE]
  }

  # Samples become rows, genes become columns.
  tumorDat <- t(tumorDat)
  colnames(tumorDat) <- genes[geneRows]
  tumorDat <- data.frame(tumorDat)
  tumorDat$proj <- project

  storList[[k]] <- tumorDat
}

# NULL when nothing was loaded, matching the original empty accumulator.
stor <- do.call(rbind, Filter(Negate(is.null), storList))


# Numeric copy of the expression table ----
#
# Every column except `proj` is converted to numeric, column by column.
# The previous apply() call coerced the whole data frame to a character matrix
# first; because of the character `proj` column, as.matrix() pushes numeric
# columns through format(), which can round the expression values, and `proj`
# itself was turned into NA with coercion warnings before being restored.
if (is.null(stor)) {
  stop("No expression data was loaded into `stor`", call. = FALSE)
}
numericDat <- stor
exprCols <- setdiff(colnames(numericDat), "proj")
numericDat[exprCols] <- lapply(
  numericDat[exprCols],
  function(x) as.numeric(as.character(x))
)
numericDat$proj <- stor$proj


#interestingProjects = c("TCGA-COAD","TCGA-HNSC","TCGA-GBM","TCGA-LGG","TCGA-SKCM","TCGA-LUAD","TCGA-LUSC","TCGA-PAAD","TCGA-STAD","TCGA-OV")
#ind = which(projects%in%interestingProjects)
#projects = projects[ind]
# Survival analysis: top vs bottom expression quartile, per gene and project ----
#
# For every gene column of `numericDat` and every project:
#   1. build overall-survival time/event from the project's clinical CSV,
#   2. keep samples at or below the 25% / at or above the 75% expression
#      quantile,
#   3. fit a Cox model and a Kaplan-Meier curve for high vs low expression,
#   4. append project, hazard ratio, Cox p-value and group sizes to `survD`,
#   5. keep the Kaplan-Meier plot in `plts` (label in `labss`) when the
#      survminer p-value is below 0.06.

# Row-wise collapse of all clinical columns whose name matches `pattern`.
# `pfun` is pmin or pmax; with na.rm = TRUE they return NA only where every
# matched column is NA. Values are converted to numeric first, so
# non-numeric entries become NA.
collapseClinicalDays <- function(clin, pattern, pfun) {
  cols <- grep(pattern, colnames(clin))
  if (length(cols) == 0) {
    return(rep(NA_real_, nrow(clin)))
  }
  vals <- lapply(
    unname(as.list(clin[cols])),
    function(v) as.numeric(as.character(v))
  )
  do.call(pfun, c(vals, list(na.rm = TRUE)))
}

survRows <- list()

# Plot storage is shared by all genes. It used to be re-created inside the
# gene loop, which discarded the plots of every gene but the last one.
nGenes <- ncol(numericDat) - 1
plts <- vector("list", length(projects) * nGenes)
counter <- 1
labss <- c()

# Patient id = first three "."-separated fields of the sample row name,
# re-joined with "-", to match the clinical barcode.
patIds <- row.names(stor)
modPatIds <- vapply(
  strsplit(as.vector(patIds), "\\."),
  function(p) paste(p[1], p[2], p[3], sep = "-"),
  character(1)
)

for (jj in seq_len(nGenes)) {
  AvgCollagenExpression <- data.frame(
    AvgCollExp = numericDat[, jj],
    patIds = patIds,
    proj = stor$proj,
    modPatIds = modPatIds,
    stringsAsFactors = FALSE
  )

  for (project in projects) {
    ind <- which(AvgCollagenExpression$proj == project)
    AvgCollagenExpression_now <- AvgCollagenExpression[ind, ]

    # Clinical data ----
    clinicalData <- read.csv(paste0(path, project, "Clinical.csv"))

    # Earliest new tumor event, latest recorded death, latest follow-up.
    new_tumor_days <- collapseClinicalDays(
      clinicalData, "days_to_new_tumor_event_after_initial_treatment", pmin
    )
    death_days <- collapseClinicalDays(clinicalData, "days_to_death", pmax)
    followUp_days <- collapseClinicalDays(
      clinicalData, "days_to_last_follow_up", pmax
    )

    all_clin <- data.frame(
      new_tumor_days = new_tumor_days,
      death_days = death_days,
      followUp_days = followUp_days
    )

    # Time to event, falling back to last follow-up (censoring time) when the
    # event date is missing.
    all_clin$new_time <- ifelse(
      is.na(new_tumor_days), followUp_days, new_tumor_days
    )
    all_clin$new_death <- ifelse(is.na(death_days), followUp_days, death_days)

    # NOTE(review): every vital_status other than exactly 'Alive' is coded as
    # an event, as in the original script -- confirm that no other non-death
    # statuses occur in the clinical files.
    all_clin$death_event <-
      ifelse(clinicalData$vital_status == 'Alive', 0, 1)

    rownames(all_clin) <- toupper(clinicalData$bcr_patient_barcode)
    all_clin$Age <- clinicalData$age_at_index
    all_clin$modPatIds <- row.names(all_clin)

    AvgCollagenExpression_now <-
      join(AvgCollagenExpression_now, all_clin, by = "modPatIds")

    # Drop samples without any survival time.
    hasTime <- !is.na(AvgCollagenExpression_now$new_death)
    AvgCollagenExpression_now <- AvgCollagenExpression_now[hasTime, ]

    # Expression groups ----
    exprNow <- AvgCollagenExpression_now$AvgCollExp
    quan <- quantile(exprNow, na.rm = TRUE)
    isHigh <- exprNow >= quan[4]
    # `& !isHigh` keeps a sample from being selected twice when the 25% and
    # 75% quantiles coincide.
    isLow <- exprNow <= quan[2] & !isHigh
    ind <- which(isLow)
    ind1 <- which(isHigh)

    AvgCollagenExpression_now <- AvgCollagenExpression_now[c(ind, ind1), ]
    event_rna <-
      ifelse(AvgCollagenExpression_now$AvgCollExp >= quan[4],
             "HighExpression",
             "LowExpression")

    if (length(unique(event_rna)) > 1) {
      AvgCollagenExpression_now$event_rna <- event_rna
      AvgCollagenExpression_now$survMonths <-
        AvgCollagenExpression_now$new_death / 30

      # With a plain character covariate the reference level is the
      # alphabetically first one ("HighExpression"), so the hazard ratio came
      # out as low-vs-high although it is stored under the HR(HIGH) heading.
      # Making "LowExpression" the reference yields the high-vs-low ratio;
      # the p-value is unaffected.
      AvgCollagenExpression_now$exprGroup <- factor(
        event_rna,
        levels = c("LowExpression", "HighExpression")
      )
      cox.ph <- coxph(
        Surv(survMonths, death_event) ~ exprGroup,
        data = AvgCollagenExpression_now
      )
      coeffs <- coef(summary(cox.ph))

      survRows[[length(survRows) + 1]] <- c(
        project,
        unname(coeffs[1, "exp(coef)"]),
        unname(coeffs[1, "Pr(>|z|)"]),
        length(ind1),
        length(ind)
      )

      # The Kaplan-Meier fit keeps the character grouping so the strata order
      # (High, Low) and therefore the red/black colour assignment is unchanged.
      fit <- survfit(
        Surv(survMonths, death_event) ~ event_rna,
        data = AvgCollagenExpression_now
      )
      pval <- surv_pvalue(fit, data = AvgCollagenExpression_now)$pval

      if (!is.na(pval) && pval < 0.06) {
        gp <- ggsurvplot(
          fit,
          data = AvgCollagenExpression_now,
          pval = FALSE,
          pval.size = 5,
          font.tickslab = c(10, "plain", "black"),
          palette = c("red", "black"),
          legend = "none"
        )
        # Store the ggplot itself. `plts[counter] = gp` tried to put the whole
        # ggsurvplot list into a single slot, which only kept its first
        # element (the plot) and raised a length-mismatch warning.
        plts[[counter]] <- gp$plot +
          theme(
            axis.title.x = element_blank(),
            axis.title.y = element_blank(),
            legend.position = "none"
          )
        counter <- counter + 1
        labss <- c(
          labss,
          paste(colnames(numericDat)[jj], gsub("TCGA-", "", project), sep = "_")
        )
      }
    }
  }
}

# One row per analysed gene/project: project, hazard ratio, Cox p-value,
# n(high), n(low). Stays empty (NULL) when no model could be fitted.
survD <- if (length(survRows) > 0) do.call(rbind, survRows) else c()



# Assemble and save the survival figure ----

# Only the first length(labss) slots of `plts` were filled. seq_along() keeps
# this an empty selection when no plot was stored (1:0 would pick slot 1).
plts1 <- plts[seq_along(labss)]

# Labels are "<gene>_<project>"; keep the second "_"-separated field.
labss1 <- vapply(
  strsplit(as.character(labss), "_"),
  function(parts) parts[2],
  character(1)
)

# `survD` is NULL when no Cox model was fitted; naming columns would fail.
if (!is.null(survD)) {
  colnames(survD) <- c("project", "HR(HIGH)", "pr(HR)", "n(high)", "n(low)")
}
#write.csv(survD,"U:/NC410Manuscript/Results/revision/SurvivalStatsLAIR1New.csv")

# Cancer types shown in the figure.
ind <- which(labss1 %in% c("HNSC", "THCA", "THYM", "BRCA", "STAD", "SKCM"))
#ind = which(labss1%in%c("LGG","ESCA","UVM","LUSC"))

plts1 <- plts1[ind]
labss1 <- labss1[ind]

#figure <- ggarrange(plotlist=plts1,ncol = 6, nrow =1,labels=labss1,font.label = list(size = 20, color = "black"),label.y=1,label.x = 0.5)
figure <- ggarrange(plotlist = plts1, ncol = 6, nrow = 1)

# Build the annotated figure before opening the device so that a failure here
# does not leave an open PNG device behind.
annotated <- annotate_figure(
  figure,
  bottom = text_grob("Time in Months", color = "black", size = 30),
  left = text_grob("Survival Probability", color = "black", rot = 90, size = 30)
)

png("U:/NC410Manuscript/Results/revision/SurvivalLair2.png", width = 20, height = 4, units = 'in', res = 300)
# print() explicitly: the figure is only drawn by auto-printing, which does
# not happen when the script is run through source(), leaving a blank file.
# dev.off() runs even if drawing fails so the device is always closed.
invisible(tryCatch(print(annotated), finally = dev.off()))


